-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU][True16][CodeGen] Add patterns to reduce intermediates #162047
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Add patterns which reduce `or` operations to register sequences when combining i16 values into i32. This removes many intermediate VGPRs and reduces register pressure.
@llvm/pr-subscribers-backend-amdgpu Author: Carl Ritson (perlfu) Changes: Add patterns which reduce `or` operations to register sequences when combining i16 values into i32. This removes many intermediate VGPRs and reduces register pressure. Patch is 1.16 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162047.diff 14 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index be084a952bc41..c7fa49a2e1d64 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3717,6 +3717,23 @@ def : GCNPat <
} // End foreach Ty = ...
} // End AddedComplexity = 1
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat<
+ (i32 (DivergentBinFrag<or>
+ (i32 (zext i16:$src_lo)),
+ (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_32:$src_hi)))))
+ )),
+ (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16)
+>;
+def : GCNPat<
+ (i32 (DivergentBinFrag<or>
+ (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_32:$src_hi))))),
+ (i32 (zext i16:$src_lo))
+ )),
+ (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16)
+>;
+}
+
let True16Predicate = UseRealTrue16Insts in
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))),
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index df9c97fa23722..117af9590ff6e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -6551,271 +6551,205 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h,...
[truncated]
|
def : GCNPat< | ||
(i32 (DivergentBinFrag<or> | ||
(i32 (zext i16:$src_lo)), | ||
(i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_32:$src_hi))))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't this be (i16 VGPR_16:$src_hi)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Many of the i16/f16 patterns use VGPR_32 -- see the patterns above this one for examples.
In practice it doesn't seem to make any difference at this point in instruction selection, so I have changed these to VGPR_16.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, in practice I think the register class there will be ignored; it's only the i16 there that matters. LGTM
Add patterns which reduce `or` operations to register sequences when combining i16 values into i32. This removes many intermediate VGPRs and reduces register pressure.